{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/nimesh/anaconda/lib/python2.7/site-packages/h5py/__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n",
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import numpy as np \n",
    "import pandas as pd \n",
    "\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from keras.preprocessing.text import Tokenizer\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense, Embedding, LSTM\n",
    "from sklearn.model_selection import train_test_split\n",
    "from keras.utils.np_utils import to_categorical\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "\n",
    "def convert(x):\n",
    "    ob = json.loads(x)\n",
    "    for k, v in ob.items():\n",
    "        if isinstance(v, list):\n",
    "            ob[k] = ','.join(v)\n",
    "        elif isinstance(v, dict):\n",
    "            for kk, vv in v.items():\n",
    "                ob['%s_%s' % (k, kk)] = vv\n",
    "            del ob[k]\n",
    "    return ob\n",
    "json_filename='/Users/nimesh/Downloads/dataset/review.json'\n",
    "with open(json_filename,'rb') as f:\n",
    "    data = f.readlines()\n",
    "\n",
    "df = pd.DataFrame([convert(line) for line in data])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "del(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data = df[['text', 'stars']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "I have taken the ratings above 3 as positive"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data['sentiment']=['pos' if (x>3) else 'neg' for x in data['stars']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data['text']= [x.lower() for x in data['text']]\n",
    "data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\\s]','',x)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "for idx,row in data.iterrows():\n",
    "    row[0] = row[0].replace('rt',' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>stars</th>\n",
       "      <th>sentiment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2368526</th>\n",
       "      <td>I have been a Morenos customer since they day they opened \\n\\nWhen it opened the restaurant was pretty very awkward It was confusing if you were supposed to sit down walk up and order bus your own plates It made the restaurant uncomfortable\\n\\nHowever the food was fantastic I used to live across the street from the restaurant and ate their a few time a week The poritions were large and everything I ate was great Chicken cabeza lengua all fantastic Asada was not the best but it was still good The salsas were good but nothing memorable Rice and beans were always delicious Even the shrimp burrito was tasty\\n\\nEverything was great for the first year or so of the restaurant\\n\\nRecently they have made some refinements and it is just not as good While they have removed the akwardness from the restaurant they have reduced portion sizes and raised prices \\n\\nStill a good restaurant but not like it used to be\\n\\nDrinks are still VERY cheap at Morenos It was always hard for me to leave without knocking back a few Dos Equis</td>\n",
       "      <td>3</td>\n",
       "      <td>neg</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3164135</th>\n",
       "      <td>The best in Wisconsin and being the dairy state thats saying a lot The ice cream is a great consistency and there are so many creative flavors like cake batter fudge zanziberry shortcake and sht just got serious Staff tends to be local kids which gives a nice friendly tone to the shop They also sell fudge candy and other treats  This is a must stop in Madison for some good ice cream</td>\n",
       "      <td>5</td>\n",
       "      <td>pos</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3299840</th>\n",
       "      <td>Jeff takes care of three yards in my neighborhood They all look great all the time If you need lawn service at a fair price xericare is it</td>\n",
       "      <td>5</td>\n",
       "      <td>pos</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1216381</th>\n",
       "      <td>This is my Walmart of choice Is it busy yes as it is located by a couple of different 55 neighborhoods This store is clean well lit well stocked and the employees are friendly</td>\n",
       "      <td>4</td>\n",
       "      <td>pos</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2854139</th>\n",
       "      <td>My husband has been talking about wanting to go to this place for several years We live about an hour away We actually had a day off yesterday so we drove up to try it out We sat outside on the porch which is really nice We had a table of five and we each ordered something different We had the Philly cheese pizza chicken pot pie pizza spinache pizza a supreme pizza and calzones We also got some cheesy bread and onion rings for an appetizer It was nice because we each got a small pizza and then try to each others They were all amazing Philly cheese was probably my favorite A couple people at our table got a berry cobbler also and that was sooooo yummy Took a while for our food to come out but they were busy so its understandable We will definitely be back</td>\n",
       "      <td>5</td>\n",
       "      <td>pos</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        text  \\\n",
       "2368526  I have been a Morenos customer since they day they opened \\n\\nWhen it opened the restaurant was pretty very awkward It was confusing if you were supposed to sit down walk up and order bus your own plates It made the restaurant uncomfortable\\n\\nHowever the food was fantastic I used to live across the street from the restaurant and ate their a few time a week The poritions were large and everything I ate was great Chicken cabeza lengua all fantastic Asada was not the best but it was still good The salsas were good but nothing memorable Rice and beans were always delicious Even the shrimp burrito was tasty\\n\\nEverything was great for the first year or so of the restaurant\\n\\nRecently they have made some refinements and it is just not as good While they have removed the akwardness from the restaurant they have reduced portion sizes and raised prices \\n\\nStill a good restaurant but not like it used to be\\n\\nDrinks are still VERY cheap at Morenos It was always hard for me to leave without knocking back a few Dos Equis   \n",
       "3164135  The best in Wisconsin and being the dairy state thats saying a lot The ice cream is a great consistency and there are so many creative flavors like cake batter fudge zanziberry shortcake and sht just got serious Staff tends to be local kids which gives a nice friendly tone to the shop They also sell fudge candy and other treats  This is a must stop in Madison for some good ice cream                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     \n",
       "3299840  Jeff takes care of three yards in my neighborhood They all look great all the time If you need lawn service at a fair price xericare is it                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            \n",
       "1216381  This is my Walmart of choice Is it busy yes as it is located by a couple of different 55 neighborhoods This store is clean well lit well stocked and the employees are friendly                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       \n",
       "2854139  My husband has been talking about wanting to go to this place for several years We live about an hour away We actually had a day off yesterday so we drove up to try it out We sat outside on the porch which is really nice We had a table of five and we each ordered something different We had the Philly cheese pizza chicken pot pie pizza spinache pizza a supreme pizza and calzones We also got some cheesy bread and onion rings for an appetizer It was nice because we each got a small pizza and then try to each others They were all amazing Philly cheese was probably my favorite A couple people at our table got a berry cobbler also and that was sooooo yummy Took a while for our food to come out but they were busy so its understandable We will definitely be back                                                                                                                                                                                                                                                                          \n",
       "\n",
       "         stars sentiment  \n",
       "2368526  3      neg       \n",
       "3164135  5      pos       \n",
       "3299840  5      pos       \n",
       "1216381  4      pos       \n",
       "2854139  5      pos       "
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.set_option('display.max_colwidth',-1)\n",
    "data[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "text         object\n",
       "stars        int64 \n",
       "sentiment    object\n",
       "dtype: object"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data['text']= [x.encode('ascii') for x in data['text']]\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Vectorizing the text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "tokenizer = Tokenizer(nb_words=2000, filters='!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n',\n",
    "                                   lower=True,split=' ')\n",
    "tokenizer.fit_on_texts(data['text'].values)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#print(tokenizer.word_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<keras.preprocessing.text.Tokenizer at 0x15c57ffd0>"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X = tokenizer.texts_to_sequences(data['text'].values)\n",
    "X = pad_sequences(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "47368"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/nimesh/anaconda/lib/python2.7/site-packages/ipykernel_launcher.py:6: UserWarning: The `dropout` argument is no longer support in `Embedding`. You can apply a `keras.layers.SpatialDropout1D` layer right after the `Embedding` layer to get the same behavior.\n",
      "  \n",
      "/Users/nimesh/anaconda/lib/python2.7/site-packages/ipykernel_launcher.py:7: UserWarning: Update your `LSTM` call to the Keras 2 API: `LSTM(196, dropout=0.2, recurrent_dropout=0.2)`\n",
      "  import sys\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "embedding_4 (Embedding)      (None, 879, 128)          256000    \n",
      "_________________________________________________________________\n",
      "lstm_4 (LSTM)                (None, 196)               254800    \n",
      "_________________________________________________________________\n",
      "dense_4 (Dense)              (None, 2)                 394       \n",
      "=================================================================\n",
      "Total params: 511,194\n",
      "Trainable params: 511,194\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "embed_dim = 128\n",
    "lstm_out = 196\n",
    "batch_size = 32\n",
    "\n",
    "model = Sequential()\n",
    "model.add(Embedding(2000, embed_dim,input_length = X.shape[1], dropout = 0.2))\n",
    "model.add(LSTM(lstm_out, dropout_U = 0.2, dropout_W = 0.2))\n",
    "model.add(Dense(2,activation='softmax'))\n",
    "model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])\n",
    "print(model.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "((37894, 879), (37894, 2))\n",
      "((9474, 879), (9474, 2))\n"
     ]
    }
   ],
   "source": [
    "Y = pd.get_dummies(data['sentiment']).values\n",
    "X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, random_state = 42)\n",
    "print(X_train.shape,Y_train.shape)\n",
    "print(X_test.shape,Y_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/1\n",
      " - 2563s - loss: 0.4319 - acc: 0.8053\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x2288edf90>"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(X_train, Y_train, batch_size = batch_size, nb_epoch = 1, verbose = 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<keras.models.Sequential at 0x17bc01e50>"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Score: 0.34\n",
      "Validation Accuracy: 0.86\n"
     ]
    }
   ],
   "source": [
    "score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)\n",
    "print(\"Score: %.2f\" % (score))\n",
    "print(\"Validation Accuracy: %.2f\" % (acc))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
